from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    # Build the SparkConf object: app name "test", running locally on all cores
    conf = SparkConf().setAppName("test").setMaster("local[*]")
    # Build the SparkContext, the entry point to the Spark execution environment
    sc = SparkContext(conf=conf)

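    # Two key-value RDDs: staff records as (id, name) and departments as
    # (id, department_name); the joins below match rows on the id key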
    rdd = sc.parallelize([(1001, "zhangsan"), (1002, "zhaosi"), (1003, "wangwu"), (1004, "zhaoliu")])
    rdd2 = sc.parallelize([(1001, "科技部"), (1002, "销售部")])

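    # Inner join on the key: only ids 1001 and 1002 appear in both RDDs,
    # so only those two pairs are kept in the result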
    rdd3 = rdd.join(rdd2)
    print(rdd3.collect())
    """
    join: implements SQL-style inner and outer joins.
    Note: it only works on RDDs of two-element tuples (key-value pairs),
    and the key is always used as the join condition.

    rdd.join(other_rdd)            # inner join
    rdd.leftOuterJoin(other_rdd)   # left outer join
    rdd.rightOuterJoin(other_rdd)  # right outer join
    """

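    # Left outer join: every key from rdd is kept; ids 1003 and 1004 have no
    # match in rdd2, so their names are paired with None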
    rdd4 = rdd.leftOuterJoin(rdd2)
    print(rdd4.collect())
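
    # Not demonstrated above: rightOuterJoin, the third variant listed in the
    # docstring. A minimal sketch; with this data every key in rdd2 (1001 and
    # 1002) has a match in rdd, so no None values appear in the result
    rdd5 = rdd.rightOuterJoin(rdd2)
    print(rdd5.collect())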
